#Data Analysis
import numpy as np
import pandas as pd
#Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=Warning)
pd.set_option('display.max_columns', None)
matches = pd.read_csv(r"E:\Coding Nest\meetup\IPL2008-2019\datasets\matches.csv")
deliveries = pd.read_csv(r"E:\Coding Nest\meetup\IPL2008-2019\datasets\deliveries.csv")
print(matches.shape, deliveries.shape)
matches.head()
deliveries.head()
matches.info()
deliveries.info()
#city column
matches[matches["city"].isna()]
matches.loc[matches["city"].isna(), 'city'] = 'Dubai'
#winner column
matches[matches["winner"].isna()]
# winner and player_of_match columns
matches.loc[matches["winner"].isna(), 'winner'] = 'No Winner'
matches.loc[matches["player_of_match"].isna(),"player_of_match"] = "No Player_of_match"
# umpire1 and umpire2 column
matches[matches["umpire1"].isna()]
matches.loc[(matches["umpire1"].isna()) | (matches["id"]==5), ['umpire1','umpire2']] = ["S Ravi","VK Sharma"]
matches.loc[(matches["umpire1"].isna()) | (matches["id"]==11413), ['umpire1','umpire2']] = ['Bruce Oxenford' , 'Sundaram Ravi']
matches.drop("umpire3",axis=1,inplace=True)
matches.replace(to_replace='Rising Pune Supergiant', value='Rising Pune Supergiants', inplace=True)
deliveries.replace(to_replace='Rising Pune Supergiant', value='Rising Pune Supergiants', inplace=True)
matches.replace(to_replace='Delhi Capitals', value='Delhi Daredevils', inplace=True)
deliveries.replace(to_replace='Delhi Capitals', value='Delhi Daredevils', inplace=True)
matches.info()
total_matches_season = matches["season"].value_counts().reset_index()
total_matches_season.columns = ["Season","Matches"]
sns.barplot(x='Season',y="Matches", data=total_matches_season,palette='magma').set(title="Total number of matches in each season");
matches_played = pd.concat([matches["team1"],matches["team2"]], axis=0)
matches_played = matches_played.value_counts().reset_index()
matches_played.columns = ["Team","Total Matches Played"]
matches_played["winner"] = matches["winner"].value_counts().reset_index()["winner"]
matches_played["winning_percentage"] = matches_played["winner"]/matches_played["Total Matches Played"]
matches_played
fig,axes = plt.subplots(nrows=1,ncols=3,figsize=(24, 6))
sns.barplot(x='Team',y='Total Matches Played',data=matches_played, ax=axes[0]).set_xticklabels(
matches_played["Team"],rotation=90)
sns.barplot(x='Team',y='winner',data=matches_played, ax=axes[1]).set_xticklabels(
matches_played["Team"],rotation=90)
sns.barplot(x='Team',y='winning_percentage',data=matches_played, ax=axes[2]).set_xticklabels(
matches_played["Team"],rotation=90);
matches_city = matches['city'].value_counts().reset_index().sort_values(by='city', ascending=False)
matches_city.columns = ['City','No. of Matches']
sns.barplot(x='City',y='No. of Matches',data=matches_city).set_xticklabels(
matches_city["City"],rotation=90);
fig, axes = plt.subplots(figsize=(15, 8))
matches_venue = matches["venue"].value_counts().reset_index().sort_values(by='venue', ascending=False)
matches_venue.columns = ["Venue",'No. of Matches']
sns.barplot(x='Venue',y='No. of Matches',data=matches_venue).set_xticklabels(
matches_venue["Venue"],rotation=90);
runs = matches.merge(deliveries, left_on = 'id', right_on = 'match_id', how = 'left').drop('id', axis = 1)
total_runs_season = runs.groupby(['season'])['total_runs'].sum().reset_index()
matches_season = matches.groupby(['season']).count()["id"].reset_index()
matches_season.rename(columns={'id':'matches'},inplace=True)
matches_season["total_runs"] = total_runs_season["total_runs"]
matches_season["average_runs_per_match"] = matches_season["total_runs"]/ matches_season['matches']
matches_season
fig,axes = plt.subplots(nrows=2,ncols=1,figsize=(15, 10))
sns.barplot(x='season',y='total_runs',data=matches_season, ax=axes[0],palette='magma').set_xticklabels(
matches_season["season"],rotation=45)
sns.barplot(x='season',y='average_runs_per_match',data=matches_season, ax=axes[1],palette='magma').set_xticklabels(
matches_season["season"],rotation=45);
deliveries['boundaries'] = 0
deliveries.loc[(deliveries['batsman_runs'] >= 4) & (deliveries['batsman_runs'] != 6),'boundaries'] = 4
deliveries.loc[(deliveries['batsman_runs'] >= 6),'boundaries'] = 6
percentage_runs_boundaries = pd.merge(
deliveries[['batsman', 'batsman_runs']].groupby(by='batsman').sum().reset_index(),
deliveries[['batsman', 'boundaries']].groupby(by='batsman').sum().reset_index(),
how='outer')
percentage_runs_boundaries['run%'] = np.round(percentage_runs_boundaries['boundaries']*100/percentage_runs_boundaries['batsman_runs'],2)
percentage_runs_boundaries = percentage_runs_boundaries.sort_values(by='run%', ascending=False)
percentage_runs_boundaries.dropna(inplace=True)
percentage_runs_boundaries = percentage_runs_boundaries.sort_values(by="batsman_runs", ascending=False).head(20)
plt.figure(figsize=[20,10])
plt.bar(percentage_runs_boundaries['batsman'], percentage_runs_boundaries['batsman_runs'], label='Batsman Runs', bottom=0)
plt.bar(percentage_runs_boundaries['batsman'], percentage_runs_boundaries['boundaries'], label='Boundary Runs', bottom=0)
plt.title('Highest Number of Runs scored as Percentage of Boundaries')
plt.xlabel('Batsman')
plt.ylabel('Runs Scored')
plt.legend()
plt.xticks(rotation=45)
plt.show()
#Man_of_match
man_of_match = matches.groupby(matches["player_of_match"])["player_of_match"].count().sort_values(ascending=False).head(10)
man_of_match = man_of_match.to_frame().rename(columns={'player_of_match':'times'}).reset_index()
#century
century = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
century = century[century['batsman_runs']>=100]
century = century.groupby(['batsman']).agg({'count'})
century.columns = century.columns.droplevel()
century = century.sort_values(by='count',ascending=False).reset_index()
#half-century
half_century = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
half_century = half_century[half_century['batsman_runs']>=50]
half_century = half_century[half_century['batsman_runs']<100]
half_century = half_century.groupby(['batsman']).agg({'count'})
half_century.columns = half_century.columns.droplevel()
half_century = half_century.sort_values(by='count',ascending=False).reset_index()
#run-30's
run_30 = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
runs_30 = run_30[run_30['batsman_runs']>=30]
runs_30 = runs_30[runs_30['batsman_runs']<50]
runs_30 = runs_30.groupby(['batsman']).agg({'count'})
runs_30.columns = runs_30.columns.droplevel()
runs_30 = runs_30.sort_values(by='count',ascending=False).reset_index()
run_stat = pd.merge(century,half_century, on='batsman',how='right')
run_stat = pd.merge(run_stat, runs_30, on='batsman',how='right')
run_stat = run_stat.fillna(0)
#Strike Rate
strike_rate = deliveries.groupby(['batsman']).agg({'ball':'count','batsman_runs':'mean'}).sort_values(by='batsman_runs',ascending=False)
strike_rate.rename(columns ={'batsman_runs' : 'strike rate'}, inplace=True)
#Runs per match
runs_per_match = deliveries.groupby(['batsman','match_id']).agg({'batsman_runs':'sum'})
#Total Runs
total_runs = runs_per_match.groupby(['batsman']).agg({'sum' ,'mean','count'})
total_runs.rename(columns ={'sum' : 'batsman run','count' : 'match count','mean' :'average score'}, inplace=True)
total_runs.columns = total_runs.columns.droplevel()
#Sixes
sixes = deliveries[['batsman','batsman_runs']][deliveries.batsman_runs==6].groupby(['batsman']).agg({'batsman_runs':'count'})
#Fours
four = deliveries[['batsman','batsman_runs']][deliveries.batsman_runs==4].groupby(['batsman']).agg({'batsman_runs':'count'})
#Batsman stats
batsman_stat = pd.merge(pd.merge(pd.merge(strike_rate,total_runs, left_index=True, right_index=True),
sixes, left_index=True, right_index=True),four, left_index=True,
right_index=True)
batsman_stat.rename(columns = {
'ball' : 'Ball', 'strike rate':'Strike Rate',
'batsman run' : 'Batsman Run','match count' : 'Match Count',
'average score' : 'Average score' ,
'batsman_runs_x' :'Six','batsman_runs_y':'Four'
},inplace=True)
batsman_stat['Strike Rate'] = batsman_stat['Strike Rate']*100
batsman_stat = batsman_stat.sort_values(by='Batsman Run',ascending=False).reset_index()
batsman_stats = pd.merge(batsman_stat, run_stat, on='batsman', how='left').fillna(0)
batsman_stats.rename(columns = {'count_x' : '100s', 'count_y' : '50s', 'count':'30s'},inplace=True)
batsman_stats
batsman_stats["total_points"] = batsman_stats["100s"]*8 + batsman_stats["50s"]*4 + batsman_stats["30s"]*2 \
+ batsman_stats["Six"]*1 + batsman_stats["Four"]*.5 \
+ batsman_stats["Batsman Run"] *.25
batsman_stats.head()
best_batsman = batsman_stats[["batsman","total_points"]]
condition_catch = (deliveries["dismissal_kind"] == 'caught')
condition_run= (deliveries["dismissal_kind"] == 'run out')
condition_stump= (deliveries["dismissal_kind"] == 'stumped')
condition_caught_bowled = (deliveries["dismissal_kind"] == 'caught and bowled')
s_catch = deliveries.loc[condition_catch,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_run = deliveries.loc[condition_run,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_stump = deliveries.loc[condition_stump,:].groupby(deliveries.fielder).dismissal_kind.count().sort_values(ascending=False)
s_caught_bowled = deliveries.loc[condition_caught_bowled,:].groupby(deliveries.bowler).dismissal_kind.count().sort_values(ascending=False)
catch= s_catch.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'catch'})
run= s_run.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'run_out'})
stump= s_stump.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'stump'})
caught_bowled = s_caught_bowled.to_frame().reset_index().rename(columns ={'dismissal_kind' : 'caught and bowled'})
field = pd.merge(pd.merge(catch,run,on='fielder', how='outer'),stump,on='fielder',how='outer')
field_stats = field[~field['fielder'].str.contains("(sub)")].reset_index().drop(['index'],axis=1).fillna(0)
field_stats
field_stats["total_points"] = field_stats["catch"]*3 + field_stats["run_out"]*2 + field_stats["stump"]*1
field_stats.head()
best_fielder = field_stats[["fielder","total_points"]]
condition = ((deliveries["dismissal_kind"].notnull()) &
(deliveries["dismissal_kind"] != 'run out') &
(deliveries["dismissal_kind"] != 'retired hurt' ) &
(deliveries["dismissal_kind"] != 'hit wicket') &
(deliveries["dismissal_kind"] != 'obstructing the field') &
(deliveries["dismissal_kind"] != 'caught and bowled'))
df_bowlers = deliveries.loc[condition,:].groupby(deliveries["bowler"])["dismissal_kind"].count().sort_values(ascending=False).reset_index()
bowlers = pd.merge(df_bowlers, caught_bowled,on='bowler',how='left').fillna(0)
high = deliveries.groupby(['match_id', 'bowler']).agg({'total_runs':'sum'}).reset_index()
over_count = deliveries.groupby(['match_id', 'bowler','over']).agg({'total_runs':'sum'}).reset_index()
overs = over_count.groupby(['match_id','bowler']).agg({'over':'count'}).reset_index()
overs = overs[overs['over']>=2]
bowlers = pd.merge( high, overs,on=['match_id', 'bowler'], how='right')
bowlers['economy'] = bowlers['total_runs']/bowlers['over']
bowlers['eco_range'] = pd.cut(bowlers['economy'], [0, 4, 5, 6, 9, 10, 11, 30],
labels=['below4', '4-5', '5-6', '6-9','9-10','10-11','above11'])
bowlers = pd.concat([bowlers, pd.get_dummies(bowlers['eco_range'], prefix='eco')], axis=1)
economy_rates = bowlers.groupby(['bowler']).agg(
{'eco_below4':'sum',
'eco_4-5':'sum',
'eco_5-6':'sum',
'eco_6-9':'sum',
'eco_9-10':'sum',
'eco_10-11':'sum',
'eco_above11':'sum'}).reset_index()
maiden_over = over_count[over_count['total_runs']==0]
maidens = maiden_over['bowler'].value_counts().to_frame().reset_index().rename({
'index':'bowler',
'bowler':'maiden_overs'},axis=1)
hauls=deliveries.groupby(['match_id', 'bowler']).agg({'player_dismissed':'count'}).reset_index()
hauls = hauls[hauls['player_dismissed']>=3]
hauls['haul'] = pd.cut(hauls['player_dismissed'], [0,3,8], labels=['3', '5'])
hauls = pd.concat([hauls,pd.get_dummies(hauls['haul'], prefix='haul')],axis=1)
hauls.drop(['player_dismissed','haul'],inplace=True,axis=1)
hauls=hauls.groupby(['bowler']).agg({'haul_3':'sum','haul_5':'sum'}).reset_index()
bowlers_stats = pd.merge(pd.merge(pd.merge(economy_rates,maidens,on='bowler', how='left'),df_bowlers,on='bowler',how='left'),hauls,on='bowler',how='right').fillna(0)
bowlers_stats.rename(columns ={'dismissal_kind' : 'wickets'},inplace=True)
bowlers_stats
bowlers_stats["total_points"] = bowlers_stats["haul_5"]*8 + bowlers_stats["haul_3"]*4 \
+ bowlers_stats["wickets"]*2 + bowlers_stats["maiden_overs"]*1\
+ bowlers_stats["eco_below4"]*.5
bowlers_stats.head()
best_bowler = bowlers_stats[["bowler","total_points"]]
best_batsman = best_batsman.rename(columns={"batsman": "player"})
best_bowler = best_bowler.rename(columns={"bowler": "player"})
best_fielder = best_fielder.rename(columns={"fielder": "player"})
best_player = pd.merge(pd.merge(best_batsman,best_bowler,on='player',how='outer'),best_fielder,on='player',how='outer')
best_player = best_player.fillna(0)
best_player
best_player['points'] = best_player['total_points']+best_player['total_points_x']\
+ best_player['total_points_y']
best_player.sort_values(by='points',ascending=False,inplace=True)
best_player=best_player.reset_index().drop(['index'],axis=1)
best_player = best_player.head(20)
best_player
trace1 = go.Bar(
x=best_player['player'],
y=best_player['total_points_x'],
name='Batting points',opacity=0.8,
marker=dict(color='lightblue'))
trace2 = go.Bar(
x=best_player['player'],
y=best_player['total_points_y'],
name='Bowling points',opacity=0.7,
marker=dict(color='gold'))
trace3 = go.Bar(
x=best_player['player'],
y=best_player['total_points'],
name='Fielding points',opacity=0.7,
marker=dict(color='lightgreen'))
data = [trace1, trace2, trace3]
layout = go.Layout(title="Points Distribution of Top Players",barmode='stack',xaxis = dict(tickmode='linear'),
yaxis = dict(title= "Points Distribution"))
fig = go.Figure(data=data, layout=layout)
fig.show()
matches_model = matches.copy()
matches_model.head()
matches_model['date'] = pd.to_datetime(matches_model['date'])
matches_model["WeekDay"] = matches_model["date"].dt.weekday
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
for col in ['city','team1','team2','toss_winner','winner','toss_decision','result','venue']:
encoder.fit(matches_model[col])
matches_model[col] = encoder.transform(matches_model[col])
matches_model.head()
# Some of the matched are affected by rain where Duckworth-Lewis method applies.
# Leaving those matches, let's try to predict.
matches_model = matches_model[matches_model["dl_applied"]==0]
matches_model.shape
matches_model.columns
X = matches_model[['season', 'city', 'team1', 'team2', 'toss_winner',
'toss_decision', 'result', 'win_by_runs','win_by_wickets', 'venue','WeekDay']]
y = matches_model['winner']
print(X.shape, y.shape)
from sklearn.model_selection import train_test_split
# Split into training and test sets
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)
y_train_preds = rf.predict(X_train)
print("Training accuracy score ", accuracy_score(y_train, y_train_preds))
print("Training Data Confusion Matrix")
print(confusion_matrix(y_train, y_train_preds))
print("Testing Data Confusion Matrix")
print("Testing accuarcy score ", accuracy_score(y_test, rf.predict(X_test)))
print(confusion_matrix(y_test, rf.predict(X_test)))